{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Imports, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/philip/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "from AugBoost import AugBoostClassifier as ABC\n",
    "from AugBoost import AugBoostRegressor as ABR\n",
    "\n",
    "from sklearn.model_selection import train_test_split, KFold\n",
    "from sklearn.metrics import log_loss\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import pickle\n",
    "import datetime\n",
    "import gc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_path = './data/'\n",
    "result_path = './results/'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Classification experiments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(data_path + 'classification_data_processed.pkl', 'rb') as f:\n",
    "    classification_datasets = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "kf = KFold(n_splits=3, shuffle=True)\n",
    "experiment_details = 'classification_nn_150_estimators_3_subsets_10_trees-between-updates'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "classification_datasets_docs = []\n",
    "prev_time = datetime.datetime.now()\n",
    "for i, dataset in enumerate(classification_datasets):\n",
    "    X, y, dataset_name = dataset.values()\n",
    "    print('**********', 'Dataset: ', dataset_name, '**********')\n",
    "    folds_docs = []\n",
    "    for train_index, test_index in kf.split(X):\n",
    "        print('~~~ new fold ~~~')\n",
    "        X_train, X_val = X.iloc[train_index], X.iloc[test_index]\n",
    "        y_train, y_val = y.iloc[train_index], y.iloc[test_index]\n",
    "        prev_time = datetime.datetime.now()\n",
    "        model = ABC(n_estimators = 150, max_epochs = 1000, learning_rate = 0.1, \\\n",
    "            n_features_per_subset = round(len(X_train.columns)/3), trees_between_feature_update = 10,\\\n",
    "            augmentation_method = 'nn', save_mid_experiment_accuracy_results = False)\n",
    "        model.fit(X = X_train, y = y_train, X_val = X_val, y_val = y_val) \n",
    "        new_time = datetime.datetime.now()\n",
    "        training_secs = (new_time - prev_time).total_seconds()\n",
    "        score = log_loss(y_val, model.predict_proba(X_val), labels = y.unique())\n",
    "        folds_docs.append((score, test_index, training_secs))\n",
    "        del model\n",
    "        gc.collect()\n",
    "    classification_datasets_docs.append((i, dataset_name, folds_docs))\n",
    "    with open(result_path + 'datasets_docs_' + experiment_details + '.pkl', 'wb') as f:\n",
    "        pickle.dump(classification_datasets_docs, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "results = []\n",
    "for dataset in classification_datasets_docs:\n",
    "    temp_losses = []\n",
    "    for j in range(len(dataset[2])):\n",
    "        temp_losses.append(dataset[2][j][0])\n",
    "    results.append([dataset[1], np.mean(temp_losses), np.std(temp_losses)])\n",
    "results = pd.DataFrame(results)\n",
    "results.columns = ['dataset', 'mean (log-loss)', 'std (log-loss)']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Regression experiments"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(data_path + 'regression_data_processed.pkl', 'rb') as f:\n",
    "    regression_datasets = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "kf = KFold(n_splits=3, shuffle=True)\n",
    "experiment_details = 'regression_nn_150_estimators_3_subsets_10_trees-between-updates'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "regression_datasets_docs = []\n",
    "prev_time = datetime.datetime.now()\n",
    "for i, dataset in enumerate(regression_datasets):\n",
    "    X, y, dataset_name = dataset.values()\n",
    "    print('**********', 'Dataset: ', dataset_name, '**********')\n",
    "    folds_docs = []\n",
    "    for train_index, test_index in kf.split(X):\n",
    "        print('~~~ new fold ~~~')\n",
    "        X_train, X_val = X.iloc[train_index], X.iloc[test_index]\n",
    "        y_train, y_val = y.iloc[train_index], y.iloc[test_index]\n",
    "        prev_time = datetime.datetime.now()\n",
    "        model = ABR(n_estimators = 150, max_epochs = 1000, learning_rate = 0.1, \\\n",
    "            n_features_per_subset = round(len(X_train.columns)/3), trees_between_feature_update = 10,\\\n",
    "            augmentation_method = 'nn', save_mid_experiment_accuracy_results = False)\n",
    "        model.fit(X = X_train, y = y_train, X_val = X_val, y_val = y_val) \n",
    "        new_time = datetime.datetime.now()\n",
    "        training_secs = (new_time - prev_time).total_seconds()\n",
    "        score = log_loss(y_val, model.predict_proba(X_val), labels = y.unique())\n",
    "        folds_docs.append((score, test_index, training_secs))\n",
    "        del model\n",
    "        gc.collect()\n",
    "    regression_datasets_docs.append((i, dataset_name, folds_docs))\n",
    "    with open(result_path + 'datasets_docs_' + experiment_details + '.pkl', 'wb') as f:\n",
    "        pickle.dump(regression_datasets_docs, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "results = []\n",
    "for dataset in regression_datasets_docs:\n",
    "    temp_losses = []\n",
    "    for j in range(len(dataset[2])):\n",
    "        temp_losses.append(dataset[2][j][0])\n",
    "    results.append([dataset[1], np.mean(temp_losses), np.std(temp_losses)])\n",
    "results = pd.DataFrame(results)\n",
    "results.columns = ['dataset', 'mean (log-loss)', 'std (log-loss)']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
